Author

Rachel Friesen

Import Data and Create Data Preview

Code
# Load the baby-name records from the lab's CSV file; the path is built
# with here() from the 'labs/lab_9' folder components.
names <- here('labs', 'lab_9', 'StateNames_A.csv') |>
  read.csv()

# Render the full data frame as an interactive DT table for browsing.
names |>
  DT::datatable()

Summarize the Number of Allisons

Code
# Rows for the name "Allison" only, with the `Gender` column exposed as `Sex`.
allison <- names |>
  filter(Name == "Allison") |>
  rename(Sex = Gender)

# Total count for every State/Sex pair, spread to one row per state with one
# column per sex. State/sex combinations with no rows are filled in as 0.
allison_by_state <- allison |>
  group_by(State, Sex) |>
  summarize(Count = sum(Count)) |>
  ungroup() |>
  pivot_wider(
    names_from = Sex,
    values_from = Count,
    values_fill = 0
  )

# Present the state totals as a gt table with readable column labels, a
# left-aligned title, and 100px-wide columns.
allison_by_state |>
  gt() |>
  cols_label(
    F = "Female",
    M = "Male"
  ) |>
  tab_header(
    title = "Frequency of Babies Named 'Allison' by U.S. State"
  ) |>
  opt_align_table_header(align = "left") |>
  cols_width(everything() ~ px(100))
Frequency of Babies Named 'Allison' by U.S. State
State Female Male
AK 232 0
AL 1535 0
AR 1198 0
AZ 1880 0
CA 12413 0
CO 1594 0
CT 1099 0
DC 321 0
DE 294 0
FL 4455 0
GA 3257 0
HI 183 0
IA 1477 0
ID 451 0
IL 5110 0
IN 3067 0
KS 1283 0
KY 1905 20
LA 1209 0
MA 2218 0
MD 2229 0
ME 340 0
MI 4014 0
MN 2374 0
MO 2882 0
MS 817 0
MT 226 0
NC 3435 0
ND 285 0
NE 807 0
NH 412 0
NJ 3052 0
NM 399 0
NV 729 0
NY 5747 0
OH 5487 0
OK 1421 0
OR 1186 0
PA 4307 0
RI 306 0
SC 1228 0
SD 376 0
TN 2488 0
TX 10192 0
UT 1125 0
VA 3220 0
VT 135 0
WA 1956 0
WI 2367 0
WV 813 0
WY 142 0

Visualize the Data

Code
# Keep only the female records, then collapse them to a single total per
# year (rows from every state are summed together).
allison_f <- allison |>
  filter(Sex == "F") |>
  group_by(Year) |>
  summarize(Count = sum(Count))

# Yearly totals drawn as points joined by a line, with a fitted straight-line
# trend (method = "lm") layered on top.
allison_f |>
  ggplot(aes(x = Year, y = Count)) +
  geom_point() +
  geom_line() +
  stat_smooth(method = "lm") +
  labs(title = "Number of Babies Named 'Allison' Over Time")

Model the Number of Allisons

Linear Regression

The estimated regression equation is:

ŷ = -102x + 209690, where x is the year and ŷ is the predicted number of female babies named Allison in that year.

Code
# Simple linear regression of the yearly female Allison total on year.
allison_lm <- lm(formula = Count ~ Year, data = allison_f)

# Coefficient table: estimate, standard error, test statistic and p-value
# for the intercept and the Year slope.
allison_lm |>
  broom::tidy()
# A tibble: 2 × 5
  term        estimate std.error statistic  p.value
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)  209690.   42972.       4.88 0.000167
2 Year           -102.      21.4     -4.74 0.000223

Residuals

The residuals do not appear to have constant variance. For fitted values between roughly 6,000 and 6,500, the residuals are around -500.

Code
# Residuals-versus-fitted scatterplot for the Allison model. augment()
# supplies the per-observation `.fitted` and `.resid` columns plotted here.
ggplot(
  data = broom::augment(allison_lm),
  mapping = aes(x = .fitted, y = .resid)
) +
  geom_point()

The name Allison appears to be decreasing in popularity. On average, the model estimates that about 102 fewer female babies are named Allison in the US with each passing year.

Allan Spellings by State

Plot the popularity of the name Allan over time

Code
# The three spellings treated as the same name in this section.
allan_variants <- c("Allan", "Alan", "Allen")

# Male records whose name is any of the three spellings, with the `Gender`
# column exposed as `Sex`.
allan_m <- names |>
  rename(Sex = Gender) |>
  filter(Sex == "M" & Name %in% allan_variants)

# One total per year; all three spellings and all states are pooled.
allan_plot <- allan_m |>
  group_by(Year) |>
  summarize(Count = sum(Count))

# Yearly totals drawn as points joined by a line, with a fitted straight-line
# trend (method = "lm") layered on top.
allan_plot |>
  ggplot(aes(x = Year, y = Count)) +
  geom_point() +
  geom_line() +
  stat_smooth(method = "lm") +
  labs(title = "Number of Babies Named 'Allan' Over Time")

Make a table comparing Allan spellings in CA and PA

Code
# Share of each spelling within each state for the year 2000, restricted to
# California and Pennsylvania. Proportions are computed per state, so every
# state's row sums to 1 across the spelling columns.
allan_share <- allan_m |>
  filter(Year == 2000) |>
  filter(State %in% c("CA", "PA")) |>
  group_by(State) |>
  mutate(prop = Count / sum(Count)) |>
  select(-c(Year, Sex, Count)) |>
  pivot_wider(
    names_from = Name,
    values_from = prop,
    values_fill = 0
  ) |>
  ungroup()

# Present the shares as a gt table: columns 2-4 (the spelling columns) are
# formatted as percentages with two decimals.
allan_share |>
  gt() |>
  tab_header(
    title = "Percent of Babies Named 'Alan'",
    subtitle = "Comparing California and Pennsylvania"
  ) |>
  fmt_percent(columns = 2:4, decimals = 2) |>
  opt_align_table_header(align = "left") |>
  cols_width(everything() ~ px(100))
Percent of Babies Named 'Alan'
Comparing California and Pennsylvania
State Alan Allen Allan
CA 65.35% 19.86% 14.79%
PA 42.86% 47.06% 10.08%